/*
*$Id: math-sll.c,v 1.16 2006/10/31 21:09:42 andrewm Exp $
*
*Change: CHUI. Patch 1.15 to 1.16: fdelapena
*
*Purpose
*   A fixed point (31.32 bit) math library.
*
*Description
*   Floating point packs the most accuracy in the available bits, but it
*   often provides more accuracy than is required.  It is time consuming to
*   carry the extra precision around, particularly on platforms that don't
*   have a dedicated floating point processor.
*
*   This library is a compromise.  All math is done using the 64 bit signed
*   "long long" format (sll), and is not intended to be portable, just as
*   fast as possible.  Since "long long" is a elementary type, it can be
*   passed around without resorting to the use of pointers.  Since the
*   format used is fixed point, there is never a need to do time consuming
*   checks and adjustments to maintain normalized numbers, as is the case
*   in floating point.
*
*   Simply put, this library is limited to handling numbers with a whole
*   part of up to 2^31 - 1 = 2.147483647e9 in magnitude, and fractional
*   parts down to 2^-32 = 2.3283064365e-10 in magnitude.  This yields a
*   decent range and accuracy for many applications.
*
*IMPORTANT
*   No checking for arguments out of range (error).
*   No checking for divide by zero (error).
*   No checking for overflow (error).
*   No checking for underflow (warning).
*   Chops, doesn't round.
*
*Functions
*   sll dbl2sll(double x_pos)            double -> sll
*   double sll2dbl(sll x_pos)            sll -> double
*
*   sll slladd(sll x_pos, sll y_pos)        x_pos + y_pos
*   sll sllsub(sll x_pos, sll y_pos)        x_pos - y_pos
*   sll sllmul(sll x_pos, sll y_pos)        x_pos *y_pos
*   sll slldiv(sll x_pos, sll y_pos)        x_pos / y_pos
*
*   sll sllinv(sll v)            1 / x_pos
*   sll sllmul2(sll x_pos)            x_pos *2
*   sll sllmul4(sll x_pos)            x_pos *4
*   sll sllmul2n(sll x_pos, int n)        x_pos *2^n, 0 <=n <=31
*   sll slldiv2(sll x_pos)            x_pos / 2
*   sll slldiv4(sll x_pos)            x_pos / 4
*   sll slldiv2n(sll x_pos, int n)        x_pos / 2^n, 0 <=n <=31
*
*   sll sllcos(sll x_pos)            cos x_pos
*   sll sllsin(sll x_pos)            sin x_pos
*   sll slltan(sll x_pos)            tan x_pos
*   sll sllatan(sll x_pos)            atan x_pos
*
*   sll sllexp(sll x_pos)            e^x_pos
*   sll slllog(sll x_pos)            ln x_pos
*
*   sll sllpow(sll x_pos, sll y_pos)        x_pos^y_pos
*   sll sllsqrt(sll x_pos)            x_pos^(1 / 2)
*
*History
*   *Oct 31 2006 Kevin Rockel v1.16
*   - Fixed typo in sllatan()
*   - Clarified sllatan() scaling
*
*   *Aug 20 2002 Nicolas Pitre <nico@cam.org> v1.15
*   - Replaced all shifting assembly with C equivalents
*   - Reformated ARM asm and changed comments to begin with @
*   - Updated C version of sllmul()
*   - Removed the unsupported architecture #error - should be portable now
*
*   *Aug 17 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.14
*   - Fixed sign handling of ARM sllmul()
*   - Ported sllmul() to x86 - it can be inlined now
*   - Updated the sllmul() comments to reflect my changes
*   - Updated the header comments
*
*   *Aug 17 2002 Nicolas Pitre <nico@cam.org> v1.13
*   - Corrected and expanded upon Andrew's sllmul() comments
*   - Added in an non-optimal but portable C version of sllmul()
*
*   *Aug 16 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.12
*   - Added in corrected optimized sllmul() for ARM by Nicolas Pitre
*   - Changed comments on multiplication to describe Nicolas's method
*
*   *Jun 17 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.11
*   - Reverted optimized sllmul() for ARM because of bug
*
*   *Jun 17 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.10
*   - Added in optimized sllmul() for ARM by Nicolas Pitre
*   - Changed comments on multiplication to describe Nicolas's method
*   - Optimized multiplications and divisions by powers of 2
*
*   *Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.9
*   - Optimized multiplications and divisions by powers of 2
*
*   *Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.8
*   - Consolidated constants
*   - Added macro for _slladd() _sllsub()
*   - Removed __inline__ from slladd() sllsub()
*   - Renamed umul() to ullmul() and made global
*   - Added function prototypes
*   - Corrected header comment about fractional range
*   - Added warning for non-Linux operating systems
*
*   *Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.7
*   - Corrected some i386 assembly comments
*   - Renamed calc_*() to _sll*()
*   - Moved _sllexp() closer to sllexp()
*
*   *Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.6
*   - Added sllmul2() sllmul4() sllmul2n() for i386
*   - Added slldiv2() slldiv4() slldiv2n() for i386
*   - Removed input constraints on sllmul2() sllmul4() sllmul2n() for ARM
*   - Removed input constraints on slldiv2() slldiv4() slldiv2n() for ARM
*   - Modified ARM assembly for WYSIWYG output
*   - Changed asm to __asm__
*
*   *Feb  5 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.5
*   - Fixed umul() for i386
*   - Fixed dbl2sll() and sll2dbl() - I forgot ARM doubles are big-endian
*   - Very lightly tested on ARM and i386 and it seems okay
*
*   *Feb  4 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.4
*   - Added umul() for i386
*
*   *Jan 20 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.3
*   - Added fast multiplication functions sllmul2(), sllmul4(), sllmul2n()
*   - Added fast division functions slldiv2(), slldiv4(), slldiv2n()
*   - Added square root function sllsqrt()
*   - Added library roll-call
*   - Reformatted the history to RPM format (ick)
*   - Moved sllexp() closer to related functions
*   - Added algorithm description to sllpow()
*
*   *Jan 19 2002 Andrew E. Mileski <andrewm@isoar.ca> v1.1
*   - Corrected constants, thanks to Mark A. Lisher for noticing
*   - Put source under CVS control
*
*   *Jan 18 2002 Andrew E. Mileski <andrewm@isoar.ca>
*   - Added some more explanation to calc_cos() and calc_sin()
*   - Added sllatan() and documented it fairly verbosely
*
*   *July 13 2000 Andrew E. Mileski <andrewm@isoar.ca>
*   - Corrected documentation for multiplication algorithm
*
*   *May 21 2000 Andrew E. Mileski <andrewm@isoar.ca>
*   - Rewrote slltanx() to avoid scaling argument for both sine and cosine
*
*   *May 19 2000  Andrew E. Mileski <andrewm@isoar.ca>
*   - Unrolled loops
*   - Added sllinv(), and sllneg()
*   - Changed constants to type "LL" (was "UL" - oops)
*   - Changed all routines to use inverse constants instead of division
*
*   *May 15, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
*   - Fixed slltan() - used sin/cos instead of sllsin/sllcos.  Doh!
*   - Added slllog(x_pos) and sllpow(x_pos,y_pos)
*
*   *May 11, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
*   - Added simple tan(x_pos) that could stand some optimization
*   - Added more constants (see <math.h>)
*
*   *May 3, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
*   - Added sllsin(), sllcos(), and trig constants
*
*   *May 2, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
*   - All routines and macros now have sll their identifiers
*   - Changed mul() to umul() and added sllmul() to handle signed numbers
*   - Added and tested sllexp(), sllint(), and sllfrac()
*   - Added some constants
*
*   *Apr 26, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
*   - Added mul(), and began testing it (unsigned only)
*
*   *Apr 25, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
*   - Added sll2dbl() [convert a signed long long to a double]
*   - Began testing.  Well gee whiz it works! :)
*
*   *Apr 24, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
*   - Added dbl2sll() [convert a double to signed long long]
*   - Began documenting
*
*   *Apr ??, 2000 - Andrew E. Mileski <andrewm@isoar.ca>
*   - Conceived, written, and fiddled with
*
*
*       Copyright (C) 2000 Andrew E. Mileski
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License version 2 as published by the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307  USA
*/

#ifndef MATHSLL_H
#define MATHSLL_H

#ifdef __cplusplus
extern "C"
{
#endif


#ifndef USE_FIXED_POINT

#include<math.h>

    typedef double sll;
    typedef double ull;

/*
 * Convenience macros for floating point mode.
 *
 * Bug fix: sllvalue() did not parenthesize its argument, so
 * sllvalue(a + b) expanded to ((double)a + b) - the cast bound only to
 * the first operand.  All arguments are now fully parenthesized.
 */
#define sllvalue(x_pos)     ((double)(x_pos))
#define int2sll(x_pos)    ((sll)(x_pos))
#define sll2int(x_pos)    ((int)(x_pos))
#define sll_abs(x_pos)    (fabs(x_pos))
#define sllneg(x_pos)       (-(x_pos))


    /* sll -> double.  In floating point mode an sll already is a double. */
    static __inline__ double sll2dbl(sll x_pos)
    {
        double result = x_pos;
        return result;
    }

    /* x_pos + y_pos */
    static __inline__ sll slladd(sll x_pos, sll y_pos)
    {
        sll total = x_pos + y_pos;
        return total;
    }

    /* x_pos - y_pos */
    static __inline__ sll sllsub(sll x_pos, sll y_pos)
    {
        sll difference = x_pos - y_pos;
        return difference;
    }

    /* x_pos * y_pos */
    static __inline__ sll sllmul(sll x_pos, sll y_pos)
    {
        sll product = x_pos * y_pos;
        return product;
    }

    /* x_pos / y_pos (no divide-by-zero check, per the file header notes) */
    static __inline__ sll slldiv(sll x_pos, sll y_pos)
    {
        sll quotient = x_pos / y_pos;
        return quotient;
    }

    /* 1 / x_pos (no divide-by-zero check) */
    static __inline__ sll sllinv(sll x_pos)
    {
        return 1.0 / x_pos;
    }


    /* sll -> float (may round: float carries fewer mantissa bits) */
    static __inline__ float sll2float(sll s)
    {
        float narrowed = (float)s;
        return narrowed;
    }

    /* float -> sll */
    static __inline__ sll float2sll(float f)
    {
        sll widened = f;
        return widened;
    }

    /* Trigonometry and square root simply defer to libm in this mode. */
    static __inline__ sll sllsin(sll x_pos)
    {
        return sin(x_pos);
    }

    static __inline__ sll sllcos(sll x_pos)
    {
        return cos(x_pos);
    }

    static __inline__ sll slltan(sll x_pos)
    {
        return tan(x_pos);
    }

    static __inline__ sll sllsqrt(sll x_pos)
    {
        return sqrt(x_pos);
    }

    static __inline__ sll dbl2sll(double dbl)
    {
        return (float)dbl;
    }

    /* x_pos^y_pos via libm */
    static __inline__ sll sllpow(sll x_pos, sll y_pos)
    {
        sll result = pow(x_pos, y_pos);
        return result;
    }

#else

    /*Data types: sll holds a signed 31.32 fixed point value, ull is its
     *unsigned counterpart (used for logical shifts and unsigned products).
     *Bug fix: "typedef Uint64 long ull;" did not compile - Uint64 is not a
     *standard type and is never defined in this file. */
    typedef signed long long sll;
    typedef unsigned long long ull;

    /*Macros
     *
     *Bug fix: sll_abs() masked with 0xefffffffffffffffLL, which clears
     *bit 60 - not the sign bit - and clearing the sign bit is not negation
     *in two's complement anyway.  Replaced with a conditional negate.
     *NOTE: sll_abs now evaluates its argument twice; do not pass
     *expressions with side effects. */
#define int2sll(x_pos)    (((sll) (x_pos)) << 32)
#define sllvalue(x_pos)     (x_pos)
#define sll2int(x_pos)    ((int) ((x_pos) >> 32))
#define sll_abs(x_pos)    (((x_pos) < 0) ? -(x_pos) : (x_pos))
#define sllint(x_pos)    ((x_pos) & 0xffffffff00000000LL)
#define sllfrac(x_pos)    ((x_pos) & 0x00000000ffffffffLL)
#define sllneg(x_pos)    (-(x_pos))
#define _slladd(x_pos,y_pos)    ((x_pos) + (y_pos))
#define _sllsub(x_pos,y_pos)    ((x_pos) - (y_pos))

    /*Constants (converted from double), all scaled by 2^32, i.e. stored in
     *the same 31.32 fixed point format as sll values.
     *Naming: CONST_A_B reads "A/B" - CONST_1_2 is 1/2, CONST_1_E is 1/e,
     *CONST_2_PI is 2/pi, CONST_2_SQRTPI is 2/sqrt(pi), and so on. */
#define CONST_0         0x0000000000000000LL
#define CONST_1         0x0000000100000000LL
#define CONST_2         0x0000000200000000LL
#define CONST_3         0x0000000300000000LL
#define CONST_4         0x0000000400000000LL
#define CONST_10        0x0000000a00000000LL
#define CONST_1_2       0x0000000080000000LL
#define CONST_1_3       0x0000000055555555LL
#define CONST_1_4       0x0000000040000000LL
#define CONST_1_5       0x0000000033333333LL
#define CONST_1_6       0x000000002aaaaaaaLL
#define CONST_1_7       0x0000000024924924LL
#define CONST_1_8       0x0000000020000000LL
#define CONST_1_9       0x000000001c71c71cLL
#define CONST_1_10      0x0000000019999999LL
#define CONST_1_11      0x000000001745d174LL
#define CONST_1_12      0x0000000015555555LL
#define CONST_1_20      0x000000000cccccccLL
#define CONST_1_30      0x0000000008888888LL
#define CONST_1_42      0x0000000006186186LL
#define CONST_1_56      0x0000000004924924LL
#define CONST_1_72      0x00000000038e38e3LL
#define CONST_1_90      0x0000000002d82d82LL
#define CONST_1_110     0x000000000253c825LL
#define CONST_1_132     0x0000000001f07c1fLL
#define CONST_1_156     0x0000000001a41a41LL
#define CONST_E         0x00000002b7e15162LL
#define CONST_1_E       0x000000005e2d58d8LL
#define CONST_SQRTE     0x00000001a61298e1LL
#define CONST_1_SQRTE   0x000000009b4597e3LL
#define CONST_LOG2_E    0x0000000171547652LL
#define CONST_LOG10_E   0x000000006f2dec54LL
#define CONST_LN2       0x00000000b17217f7LL
#define CONST_LN10      0x000000024d763776LL
#define CONST_PI        0x00000003243f6a88LL
#define CONST_PI_2      0x00000001921fb544LL
#define CONST_PI_4      0x00000000c90fdaa2LL
#define CONST_1_PI      0x00000000517cc1b7LL
#define CONST_2_PI      0x00000000a2f9836eLL
#define CONST_2_SQRTPI  0x0000000120dd7504LL
#define CONST_SQRT2     0x000000016a09e667LL
#define CONST_1_SQRT2   0x00000000b504f333LL

    /*x_pos + y_pos.  Fixed point addition is plain integer addition;
     *overflow is not checked (see the file header notes). */
    static __inline__ sll slladd(sll x_pos, sll y_pos)
    {
        sll total = x_pos + y_pos;
        return total;
    }

    /*x_pos - y_pos.  Plain integer subtraction, no overflow checks. */
    static __inline__ sll sllsub(sll x_pos, sll y_pos)
    {
        sll difference = x_pos - y_pos;
        return difference;
    }

    /*
    *Let a = A *2^32 + a_hi *2^0 + a_lo *2^(-32)
    *Let b = B *2^32 + b_hi *2^0 + b_lo *2^(-32)
    *
    *Where:
    *  *_hi is the integer part
    *  *_lo the fractional part
    *  A and B are the sign (0 for positive, -1 for negative).
    *
    *a *b=(A *2^32 + a_hi *2^0 + a_lo *2^-32)
    *      *(B *2^32 + b_hi *2^0 + b_lo *2^-32)
    *
    *Expanding the terms, we get:
    *
    *    = A *B *2^64 + A *b_h *2^32 + A *b_l *2^0
    *    + a_h *B *2^32 + a_h *b_h *2^0 + a_h *b_l *2^-32
    *    + a_l *B *2^0 + a_l *b_h *2^-32 + a_l *b_l *2^-64
    *
    *Grouping by powers of 2, we get:
    *
    *    = A *B *2^64
    *    Meaningless overflow from sign extension - ignore
    *
    *    + (A *b_h + a_h *B) *2^32
    *    Overflow which we can't handle - ignore
    *
    *    + (A *b_l + a_h *b_h + a_l *B) *2^0
    *    We only need the low 32 bits of this term, as the rest is overflow
    *
    *    + (a_h *b_l + a_l *b_h) *2^-32
    *    We need all 64 bits of this term
    *
    *    +  a_l *b_l *2^-64
    *    We only need the high 32 bits of this term, as the rest is underflow
    *
    *Note that:
    *  a > 0 && b > 0: A =  0, B =  0 and the third term is a_h *b_h
    *  a < 0 && b > 0: A=-1, B =  0 and the third term is a_h *b_h - b_l
    *  a > 0 && b < 0: A =  0, B=-1 and the third term is a_h *b_h - a_l
    *  a < 0 && b < 0: A=-1, B=-1 and the third term is a_h *b_h - a_l - b_l
    */
#if defined(__arm__)
    /*
     * 64-bit fixed point multiply, ARM inline-assembly version.
     *
     * Implements the partial-product scheme described in the comment above:
     * umull/mul/umlal accumulate the unsigned 32x32 partial products, and
     * the tst/subne pairs apply the signed corrections (subtract the other
     * operand's low word whenever an operand's sign bit is set).
     */
    static __inline__ sll sllmul(sll left, sll right)
    {
        /*
        *From gcc/config/arm/arm.h:
        *  In a pair of registers containing a DI or DF value the 'Q'
        *  operand returns the register number of the register containing
        *  the least significant part of the value.  The 'R' operand returns
        *  the register number of the register containing the most
        *  significant part of the value.
        */
        sll retval;

        __asm__ (
            "@ sllmul\n\t"
            "umull    %R0, %Q0, %Q1, %Q2\n\t"
            "mul    %R0, %R1, %R2\n\t"
            "umlal    %Q0, %R0, %Q1, %R2\n\t"
            "umlal    %Q0, %R0, %Q2, %R1\n\t"
            "tst    %R1, #0x80000000\n\t"
            "subne    %R0, %R0, %Q2\n\t"
            "tst    %R2, #0x80000000\n\t"
            "subne    %R0, %R0, %Q1\n\t"
    : "=&r" (retval)
                    : "%r" (left), "r" (right)
                    : "cc"
                );

        return retval;
    }
#elif defined(__i386__)
    /*
     * 64-bit fixed point multiply, i386 inline-assembly version.
     *
     * Four 32x32->64 "mull" instructions build the partial products per the
     * algorithm comment above (%1/%2 = low/high words of left, %3/%4 = low/
     * high words of right).  The btl/jnc/subl sequences subtract the other
     * operand's low word when an operand is negative.
     */
    static __inline__ sll sllmul(sll left, sll right)
    {
        register sll retval;
        __asm__(
            "# sllmul\n\t"
            "    movl    %1, %%eax\n\t"
            "    mull     %3\n\t"
            "    movl    %%edx, %%ebx\n\t"
            "\n\t"
            "    movl    %2, %%eax\n\t"
            "    mull     %4\n\t"
            "    movl    %%eax, %%ecx\n\t"
            "\n\t"
            "    movl    %1, %%eax\n\t"
            "    mull    %4\n\t"
            "    addl    %%eax, %%ebx\n\t"
            "    adcl    %%edx, %%ecx\n\t"
            "\n\t"
            "    movl    %2, %%eax\n\t"
            "    mull    %3\n\t"
            "    addl    %%ebx, %%eax\n\t"
            "    adcl    %%ecx, %%edx\n\t"
            "\n\t"
            "    btl    $31, %2\n\t"
            "    jnc    1f\n\t"
            "    subl    %3, %%edx\n\t"
            "1:    btl    $31, %4\n\t"
            "    jnc    1f\n\t"
            "    subl    %1, %%edx\n\t"
            "1:\n\t"
    : "=&A" (retval)
                    : "m" (left), "m" (((unsigned *) &left)[1]),
                    "m" (right), "m" (((unsigned *) &right)[1])
                    : "ebx", "ecx", "cc"
                );
        return retval;
    }
#else
    /*Plain C version: not optimal but portable. */
#warning Fixed point sllmul() is using the non-optimal but portable C version
    static __inline__ sll sllmul(sll a, sll b)
    {
        /*Bug fix: the low halves were declared "Uint32", a type this file
         *never defines.  Standard unsigned int is used instead (assumed to
         *be 32 bits, as elsewhere in this file). */
        unsigned int a_lo, b_lo;
        signed int a_hi, b_hi;
        sll x_pos;

        a_lo = (unsigned int) a;
        a_hi = (signed int) ((unsigned long long) a >> 32);
        b_lo = (unsigned int) b;
        b_hi = (signed int) ((unsigned long long) b >> 32);

        /*Per the derivation above: only the low 32 bits of a_hi*b_hi land
         *in the high word; the signed cross terms carry the corrections.
         *The high product is computed unsigned so the (deliberately
         *ignored) overflow wraps instead of being signed-overflow
         *undefined behaviour. */
        x_pos = ((unsigned long long) ((unsigned int) a_hi * (unsigned int) b_hi) << 32)
          + (((unsigned long long) a_lo * b_lo) >> 32)
          + (sll) a_lo * b_hi
          + (sll) b_lo * a_hi;

        return x_pos;
    }
#endif

    /*
     * 1 / v by Newton-Raphson: u' = u * (2 - v * u), which doubles the
     * number of correct bits per step, so six steps from a power-of-two
     * seed cover the representable range.  No divide-by-zero check, per
     * the file header notes.
     *
     * Bug fix: the seed loop used "((ull)u) >>= 1", which assigns through
     * a cast - a cast result is not a modifiable lvalue in standard C and
     * only built as an old GCC extension.  Shift an unsigned copy instead.
     */
    static __inline__ sll sllinv(sll v)
    {
        int sgn = 0;
        sll u;
        ull s = -1;

        /*Use positive numbers, or the approximation won't work */
        if (v < CONST_0)
        {
            v = sllneg(v);
            sgn = 1;
        }

        /*An approximation - must be larger than the actual value */
        for (u = v; u; u = (sll) ((ull) u >> 1))
            s >>= 1;

        /*Newton's Method */
        u = sllmul(s, _sllsub(CONST_2, sllmul(v, s)));
        u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));
        u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));
        u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));
        u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));
        u = sllmul(u, _sllsub(CONST_2, sllmul(v, u)));

        return ((sgn) ? sllneg(u) : u);
    }

    /*x_pos / y_pos, computed as x_pos * (1 / y_pos).  No divide-by-zero
     *check, per the file header notes. */
    static __inline__ sll slldiv(sll left, sll right)
    {
        sll reciprocal = sllinv(right);
        return sllmul(left, reciprocal);
    }

    /*x_pos * 2.  Uses multiplication rather than "x_pos << 1": left-shifting
     *a negative signed value is undefined behaviour in C, while the multiply
     *is well defined (and typically compiles to the same shift). */
    static __inline__ sll sllmul2(sll x_pos)
    {
        return x_pos * 2;
    }

    /*x_pos * 4 (same rationale as sllmul2) */
    static __inline__ sll sllmul4(sll x_pos)
    {
        return x_pos * 4;
    }

    /*
    *x_pos * 2^n, valid for 0 <= n <= 31 (see the function list in the file
    *header).  The ARM version shifts the two 32-bit halves by hand; on
    *other targets a single 64-bit shift suffices.
    */
    static __inline__ sll sllmul2n(sll x_pos, int n)
    {
        sll y_pos;

#if defined(__arm__)
        /*
        *On ARM we need to do explicit assembly since the compiler
        *doesn't know the range of n is limited and decides to call
        *a library function instead.
        */
        __asm__ (
            "@ sllmul2n\n\t"
            "mov    %R0, %R1, lsl %2\n\t"
            "orr    %R0, %R0, %Q1, lsr %3\n\t"
            "mov    %Q0, %Q1, lsl %2\n\t"
    : "=r" (y_pos)
                    : "r" (x_pos), "rM" (n), "rM" (32 - n)
                );
#else
    y_pos = x_pos << n;
#endif

        return y_pos;
    }

    /*x_pos / 2 via arithmetic right shift (chops toward minus infinity for
     *negative values; relies on ">>" of a signed value being arithmetic,
     *as the sll2int() macro in this file already does). */
    static __inline__ sll slldiv2(sll x_pos)
    {
        sll halved = x_pos >> 1;
        return halved;
    }

    /*x_pos / 4, same shift-based scheme as slldiv2 */
    static __inline__ sll slldiv4(sll x_pos)
    {
        sll quartered = x_pos >> 2;
        return quartered;
    }

    /*
    *x_pos / 2^n, valid for 0 <= n <= 31 (see the file header).  The ARM
    *version shifts the halves by hand, using "asr" on the high half so the
    *sign is preserved; elsewhere a single 64-bit arithmetic shift suffices.
    */
    static __inline__ sll slldiv2n(sll x_pos, int n)
    {
        sll y_pos;

#if defined(__arm__)
        /*
        *On ARM we need to do explicit assembly since the compiler
        *doesn't know the range of n is limited and decides to call
        *a library function instead.
        */
        __asm__ (
            "@ slldiv2n\n\t"
            "mov    %Q0, %Q1, lsr %2\n\t"
            "orr    %Q0, %Q0, %R1, lsl %3\n\t"
            "mov    %R0, %R1, asr %2\n\t"
    : "=r" (y_pos)
                    : "r" (x_pos), "rM" (n), "rM" (32 - n)
                );
#else
    y_pos = x_pos >> n;
#endif

        return y_pos;
    }

    /*
    *Unpack the IEEE floating point double format and put it in fixed point
    *sll format.
    *
    *Steps: (1) on ARM, swap the two 32-bit halves (that target stores the
    *double's halves in the other order - see below), (2) re-insert the
    *mantissa's implicit leading 1, (3) shift right by (1053 - exponent) to
    *align the binary point (1053 = IEEE bias 1023 + 30, matching where the
    *leading 1 was placed), and (4) negate if the sign bit was set.
    *Zero and denormals (exponent field == 0) map to 0.
    *NOTE(review): assumes a little-endian pair of 32-bit words for the
    *double on non-ARM targets - confirm before porting.
    */
    static __inline__ sll dbl2sll(double dbl)
{
        union
        {
            double d;
            unsigned u[2];
            ull ull;
            sll sll;
        } in, retval;
        register unsigned exp;

        /*Move into memory as args might be passed in regs */
        in.d = dbl;

#if defined(__arm__)

        /*ARM architecture has a big-endian double */
        exp = in.u[0];
        in.u[0]=in.u[1];
        in.u[1]=exp;

#endif /*defined(__arm__) */

        /*Leading 1 is assumed by IEEE */
        retval.u[1]=0x40000000;

        /*Unpack the mantissa into the Uint64 */
        retval.u[1] |=(in.u[1] << 10) & 0x3ffffc00;
        retval.u[1] |=(in.u[0] >> 22) & 0x000003ff;
        retval.u[0]=in.u[0] << 10;

        /*Extract the exponent and align the decimals */
        exp=(in.u[1] >> 20) & 0x7ff;
        if (exp)
            retval.ull >>=1053 - exp;
        else
            return 0L;

        /*Negate if negative flag set */
        if (in.u[1] & 0x80000000)
            retval.sll=-retval.sll;

        return retval.sll;
    }

    /*float -> sll: widen to double and reuse dbl2sll(). */
    static __inline__ sll float2sll(float f)
    {
        double widened = f;
        return dbl2sll(widened);
    }

    /*
    *Pack a fixed point sll into an IEEE floating point double - the mirror
    *image of dbl2sll(): take the magnitude, left-normalize until the top
    *bit is set (decrementing the exponent from 1053 = bias 1023 + 30),
    *drop the implicit leading 1, then merge mantissa, exponent, and sign.
    */
    static __inline__ double sll2dbl(sll s)
    {
        union
        {
            double d;
            unsigned u[2];
            ull ull;
            sll sll;
        } in, retval;
        register unsigned exp;
        register unsigned flag;

        if (s == 0)
            return 0.0;

        /*Move into memory as args might be passed in regs */
        in.sll = s;

        /*Handle the negative flag.  The raw compare "< 1" means "<= 0" in
        *integer terms, which after the zero check above is "negative". */
        if (in.sll < 1)
        {
            flag = 0x80000000;
            in.ull = sllneg(in.sll);
        }
        else
            flag = 0x00000000;

        /*Normalize: shift left until the most significant bit is set,
        *decrementing the exponent for each shift */
        for (exp = 1053; in.ull && (in.u[1] & 0x80000000) == 0; exp--)
        {
            in.ull <<=1;
        }
        /*Shift out the leading 1 (implicit in IEEE) and compensate */
        in.ull <<=1;
        exp++;
        /*Drop into the 52-bit mantissa field */
        in.ull >>=12;
        retval.ull = in.ull;
        /*Merge in the sign and the biased exponent */
        retval.u[1] |=flag | (exp << 20);

#if defined(__arm__)

        /*ARM architecture has a big-endian double */
        exp = retval.u[0];
        retval.u[0]=retval.u[1];
        retval.u[1]=exp;

#endif /*defined(__arm__) */

        return retval.d;
    }

    /*sll -> float: convert through double, then narrow. */
    static __inline__ float sll2float(sll s)
    {
        double wide = sll2dbl(s);
        return (float)wide;
    }

    /*
    *Calculate cos x_pos where -pi/4 <= x_pos <= pi/4
    *
    *Maclaurin series evaluated innermost-first by Horner's rule:
    *   cos x = 1 - x^2/2 (1 - x^2/12 (1 - x^2/30 (1 - x^2/56
    *           (1 - x^2/90 (1 - x^2/132)))))
    *The divisors are ratios of successive even factorials:
    *   2!, 4!/2! = 12, 6!/4! = 30, 8!/6! = 56, 10!/8! = 90, 12!/10! = 132.
    *Truncating after x^12 is safe: (pi/4)^12 / 12! < 2^-32, below one lsb.
    */
    static __inline__ sll _sllcos(sll x_pos)
    {
        sll sum, xsq;

        xsq = sllmul(x_pos, x_pos);

        sum = _sllsub(CONST_1, sllmul(xsq, CONST_1_132));
        sum = _sllsub(CONST_1, sllmul(sllmul(xsq, sum), CONST_1_90));
        sum = _sllsub(CONST_1, sllmul(sllmul(xsq, sum), CONST_1_56));
        sum = _sllsub(CONST_1, sllmul(sllmul(xsq, sum), CONST_1_30));
        sum = _sllsub(CONST_1, sllmul(sllmul(xsq, sum), CONST_1_12));
        sum = _sllsub(CONST_1, slldiv2(sllmul(xsq, sum)));
        return sum;
    }

    /*
    *Calculate sin x_pos where -pi/4 <= x_pos <= pi/4
    *
    *Maclaurin series by a Horner-style ladder that uses x_pos itself as the
    *additive term:
    *   r = x - x^2/156
    *   r = x - x^2 r/110,  r = x - x^2 r/72,  r = x - x^2 r/42,
    *   r = x - x^2 r/20,   r = x - x^2 r/6
    *The divisors are 3!, 5!/3! = 20, 7!/5! = 42, 9!/7! = 72, 11!/9! = 110,
    *13!/11! = 156, reproducing x - x^3/3! + x^5/5! - ...; the truncation
    *error is below 2^-32 over this interval.
    */
    static __inline__ sll _sllsin(sll x_pos)
    {
        sll sum, xsq;

        xsq = sllmul(x_pos, x_pos);

        sum = _sllsub(x_pos, sllmul(xsq, CONST_1_156));
        sum = _sllsub(x_pos, sllmul(sllmul(xsq, sum), CONST_1_110));
        sum = _sllsub(x_pos, sllmul(sllmul(xsq, sum), CONST_1_72));
        sum = _sllsub(x_pos, sllmul(sllmul(xsq, sum), CONST_1_42));
        sum = _sllsub(x_pos, sllmul(sllmul(xsq, sum), CONST_1_20));
        sum = _sllsub(x_pos, sllmul(sllmul(xsq, sum), CONST_1_6));
        return sum;
    }

    /*
    *cos x_pos for arbitrary x_pos: reduce to the nearest multiple of pi/2
    *so the residual angle lies in [-pi/4, pi/4], then pick the series and
    *sign from the quadrant.
    */
    static __inline__ sll sllcos(sll x_pos)
    {
        int quadrant;
        sll result;

        /*quadrant = round(x_pos / (pi/2)); x_pos -= quadrant * pi/2 */
        quadrant = sll2int(_slladd(sllmul(x_pos, CONST_2_PI), CONST_1_2));
        x_pos = _sllsub(x_pos, sllmul(int2sll(quadrant), CONST_PI_2));

        switch (quadrant & 3)
        {
        case 1:
            result = sllneg(_sllsin(x_pos));
            break;
        case 2:
            result = sllneg(_sllcos(x_pos));
            break;
        case 3:
            result = _sllsin(x_pos);
            break;
        default:
            result = _sllcos(x_pos);
            break;
        }
        return result;
    }

    /*
    *sin x_pos for arbitrary x_pos: same quadrant reduction as sllcos(),
    *with the series/sign table rotated by one quadrant.
    */
    static __inline__ sll sllsin(sll x_pos)
    {
        int quadrant;
        sll result;

        /*quadrant = round(x_pos / (pi/2)); x_pos -= quadrant * pi/2 */
        quadrant = sll2int(_slladd(sllmul(x_pos, CONST_2_PI), CONST_1_2));
        x_pos = _sllsub(x_pos, sllmul(int2sll(quadrant), CONST_PI_2));

        switch (quadrant & 3)
        {
        case 1:
            result = _sllcos(x_pos);
            break;
        case 2:
            result = sllneg(_sllsin(x_pos));
            break;
        case 3:
            result = sllneg(_sllcos(x_pos));
            break;
        default:
            result = _sllsin(x_pos);
            break;
        }
        return result;
    }

    /*
    *tan x_pos for arbitrary x_pos, via the same quadrant reduction as
    *sllsin()/sllcos().  No pole checks, per the file header notes.
    */
    static __inline__ sll slltan(sll x_pos)
    {
        int quadrant;
        sll result;

        quadrant = sll2int(_slladd(sllmul(x_pos, CONST_2_PI), CONST_1_2));
        x_pos = _sllsub(x_pos, sllmul(int2sll(quadrant), CONST_PI_2));

        if (quadrant & 1)
            /*Odd quadrants: tan = -cos/sin of the residual angle */
            result = sllneg(slldiv(_sllcos(x_pos), _sllsin(x_pos)));
        else
            /*Even quadrants: tan = sin/cos of the residual angle */
            result = slldiv(_sllsin(x_pos), _sllcos(x_pos));
        return result;
    }

    /*
    *atan x = SUM[n=0..inf] (-1)^n x^(2n+1) / (2n+1),  |x| < 1
    *
    *Use the two-term approximation
    *   a = x - x^3/3
    *and let d be the remaining angle, so atan x = a + d.  Applying the
    *standard identity
    *   tan (u - v) = (tan u - tan v) / (1 + tan u tan v)
    *with u = atan x, v = a, and t = tan a gives
    *   tan (atan x - a) = (x - t) / (1 + x t)
    *so
    *   atan x = a + atan ((x - t) / (1 + x t))
    *i.e. the residual angle can be fed back through the same two-term
    *approximation.  The typical worst case, x = 1, converges in three
    *iterations.
    */
    static __inline__ sll _sllatan(sll x_pos)
    {
        sll approx, tangent, accum;

        /*First pass: two-term series */
        approx = sllmul(x_pos, _sllsub(CONST_1, sllmul(x_pos, sllmul(x_pos, CONST_1_3))));
        accum = approx;

        /*Second pass: fold the residual angle back in */
        tangent = slldiv(_sllsin(approx), _sllcos(approx));
        x_pos = slldiv(_sllsub(x_pos, tangent), _slladd(CONST_1, sllmul(tangent, x_pos)));
        approx = sllmul(x_pos, _sllsub(CONST_1, sllmul(x_pos, sllmul(x_pos, CONST_1_3))));
        accum = _slladd(accum, approx);

        /*Third pass */
        tangent = slldiv(_sllsin(approx), _sllcos(approx));
        x_pos = slldiv(_sllsub(x_pos, tangent), _slladd(CONST_1, sllmul(tangent, x_pos)));
        approx = sllmul(x_pos, _sllsub(CONST_1, sllmul(x_pos, sllmul(x_pos, CONST_1_3))));
        return _slladd(accum, approx);
    }

    /*
    *atan x_pos over the full range.  For |x_pos| > 1 use the identity
    *atan x = +-pi/2 - atan(1/x); otherwise the series routine is used
    *directly.
    */
    static __inline__ sll sllatan(sll x_pos)
    {
        sll quarter_turn;

        if (x_pos > CONST_1)
            quarter_turn = CONST_PI_2;
        else if (x_pos < sllneg(CONST_1))
            quarter_turn = sllneg(CONST_PI_2);
        else
            return _sllatan(x_pos);

        return _sllsub(quarter_turn, _sllatan(sllinv(x_pos)));
    }

    /*
    *Calculate e^x_pos where -0.5 <= x_pos <= 0.5
    *
    *Maclaurin series by Horner's rule:
    *   e^x = 1 + x (1 + x/2 (1 + x/3 (... (1 + x/11))))
    *Eleven terms suffice: 0.5^11 / 11! < 2^-32, below one lsb.
    *
    *Bug fix: the first step multiplied by a literal 0 (a no-op that only
    *forced the innermost factor to 1) and the outermost "1 + x * (...)"
    *step was missing entirely, so the ladder returned the value of
    *(e^x - 1)/x rather than e^x.  The full ladder is restored below.
    */
    static __inline__ sll _sllexp(sll x_pos)
    {
        sll retval;
        retval = _slladd(CONST_1, sllmul(x_pos, CONST_1_11));
        retval = _slladd(CONST_1, sllmul(retval, sllmul(x_pos, CONST_1_10)));
        retval = _slladd(CONST_1, sllmul(retval, sllmul(x_pos, CONST_1_9)));
        retval = _slladd(CONST_1, sllmul(retval, slldiv2n(x_pos, 3)));
        retval = _slladd(CONST_1, sllmul(retval, sllmul(x_pos, CONST_1_7)));
        retval = _slladd(CONST_1, sllmul(retval, sllmul(x_pos, CONST_1_6)));
        retval = _slladd(CONST_1, sllmul(retval, sllmul(x_pos, CONST_1_5)));
        retval = _slladd(CONST_1, sllmul(retval, slldiv4(x_pos)));
        retval = _slladd(CONST_1, sllmul(retval, sllmul(x_pos, CONST_1_3)));
        retval = _slladd(CONST_1, sllmul(retval, slldiv2(x_pos)));
        retval = _slladd(CONST_1, sllmul(retval, x_pos));
        return retval;
    }

    /*
    *Calculate e^x_pos for arbitrary x_pos.
    *
    *Split x_pos into its nearest integer and a residual in [-0.5, 0.5];
    *run the series on the residual, then multiply in the integer part's
    *contribution (powers of e, or of 1/e for negative integers) by binary
    *exponentiation.
    */
    static __inline__ sll sllexp(sll x_pos)
    {
        int power;
        sll base, result;

        base = CONST_E;

        /*Nearest integer: floor(x_pos + 1/2) */
        power = sll2int(_slladd(x_pos, CONST_1_2));
        result = _sllexp(_sllsub(x_pos, int2sll(power)));

        /*Make the exponent non-negative by switching the base to 1/e */
        if (power < 0)
        {
            power = -power;
            base = CONST_1_E;
        }

        /*Square-and-multiply scaling */
        while (power)
        {
            if (power & 1)
                result = sllmul(result, base);
            base = sllmul(base, base);
            power >>= 1;
        }
        return result;
    }

    /*
    *Calculate the natural logarithm using the Newton-Raphson method.
    *
    *The argument is first scaled into [1/sqrt(e), sqrt(e)] by repeated
    *multiplication with e or 1/e, counting whole units of ln along the
    *way; three refinement passes then converge on the remainder.
    */
    static __inline__ sll slllog(sll x_pos)
    {
        sll step, result = CONST_0;

        /*Scale up: each multiply by e adds -1 to the accumulated ln */
        while (x_pos < CONST_1_SQRTE)
        {
            result = _sllsub(result, CONST_1);
            x_pos = sllmul(x_pos, CONST_E);
        }
        /*Scale down: each multiply by 1/e adds +1 */
        while (x_pos > CONST_SQRTE)
        {
            result = _slladd(result, CONST_1);
            x_pos = sllmul(x_pos, CONST_1_E);
        }

        /*Each pass: step = (x - 1)(x - 3)/2; ln -= step; x *= e^step */
        step = sllmul(_sllsub(x_pos, CONST_1), slldiv2(_sllsub(x_pos, CONST_3)));
        result = _sllsub(result, step);
        x_pos = sllmul(x_pos, _sllexp(step));

        step = sllmul(_sllsub(x_pos, CONST_1), slldiv2(_sllsub(x_pos, CONST_3)));
        result = _sllsub(result, step);
        x_pos = sllmul(x_pos, _sllexp(step));

        step = sllmul(_sllsub(x_pos, CONST_1), slldiv2(_sllsub(x_pos, CONST_3)));
        return _sllsub(result, step);
    }

    /*
    *x_pos^y_pos, computed via x^y = e^(y * ln x).
    *x^0 is defined as 1; no other domain checks (see the file header).
    */
    static __inline__ sll sllpow(sll x_pos, sll y_pos)
    {
        sll scaled_log;

        if (y_pos == CONST_0)
            return CONST_1;

        scaled_log = sllmul(y_pos, slllog(x_pos));
        return sllexp(scaled_log);
    }

    /*
    *Square root by Newton's method on f(u) = u^2 - x, whose positive zero
    *is sqrt(x):
    *   u' = u - (u - x/u) / 2
    *To converge quickly the argument is first written x = 4^k * z with
    *0.5 <= z < 2, since sqrt(4^k * z) = 2^k * sqrt(z).  Starting from
    *u = 1 (which sits between sqrt(1/2) = 0.707 and sqrt(2) = 1.414),
    *four iterations are enough over that interval.
    */
    static __inline__ sll sllsqrt(sll x_pos)
    {
        sll scale, root;

        /*Power-of-two scale factor, starts at 1 */
        scale = CONST_1;

        /*0 and 1 map to themselves; negative input is returned unchanged
         *(no argument checking, per the file header notes) */
        if (x_pos <= CONST_0 || x_pos == CONST_1)
            return x_pos;

        /*Factor out fours: x = 4^k * z with 0.5 <= z < 2 */
        while (x_pos >= CONST_2)
        {
            x_pos = slldiv4(x_pos);
            scale = sllmul2(scale);
        }
        while (x_pos < CONST_1_2)
        {
            x_pos = sllmul4(x_pos);
            scale = slldiv2(scale);
        }

        /*x was an exact power of four */
        if (x_pos == CONST_1)
            return scale;

        /*Four Newton iterations from u = 1 */
        root = CONST_1;
        root = _sllsub(root, slldiv2(_sllsub(root, slldiv(x_pos, root))));
        root = _sllsub(root, slldiv2(_sllsub(root, slldiv(x_pos, root))));
        root = _sllsub(root, slldiv2(_sllsub(root, slldiv(x_pos, root))));
        root = _sllsub(root, slldiv2(_sllsub(root, slldiv(x_pos, root))));

        /*Undo the scaling */
        return sllmul(scale, root);
    }

#endif

#ifdef __cplusplus
}
#endif

#endif /*MATHSLL_H */
